In [1]:
import numpy as np
import sklearn
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib 


def is_covariance(x):
    # checks if x is a valid covariance matrix (symmetric and pos-def)
    symmetric = np.allclose(x, x.T)  # allclose tolerates floating point asymmetry
    pos_def=np.all(np.linalg.eigvals(x) > 0)
    return symmetric and pos_def



def plot_data(x,title,eigen=False):
    # plot 3d data
    # x must be of size (n,3)
    # if eigen=True, also plot eigenvectors of cov(x), with the corresponding eigenvalue
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(x[:,0],x[:,1],zs=x[:,2],alpha=0.1)
    limit_max=np.max(x)+1
    limit_min=np.min(x)-1
    ax.set_xlabel("x")
    ax.set_ylabel("y")
    ax.set_zlabel("z")
    ax.set_xlim(limit_min, limit_max)
    ax.set_ylim(limit_min, limit_max)
    ax.set_zlim(limit_min, limit_max)
    ax.set_title(title)
    if eigen:
        n,features=x.shape
        mu=np.mean(x,axis=0)
        pca = PCA(n_components=features)
        pca.fit(x)
        eigenvectors=pca.components_ # each row is an eigenvector
        eigenvalues=pca.explained_variance_ # eigenvalues
        
        scale=np.abs(x).max()
        #scaled_eigenvectors=eigenvectors/(eigenvalues+0.2)*scale
        scaled_eigenvectors=eigenvectors*scale/4
        ax.scatter([mu[0]],[mu[1]],zs=[mu[2]],color="green",s=150)
        endpoints=scaled_eigenvectors+mu
        for i in range(features):
            ax.plot([mu[0], endpoints[i,0]], [mu[1],endpoints[i,1]], [mu[2], endpoints[i,2]], color='red', alpha=0.8, lw=2)
            ax.text(endpoints[i,0], endpoints[i,1], endpoints[i,2], r'$\lambda_{%d}$=%0.2f' % (i, eigenvalues[i]), size=8, zorder=1)


Using matplotlib backend: TkAgg
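
As a quick sanity check (a minimal sketch, not part of the notebook's pipeline), is_covariance should accept a symmetric positive-definite matrix and reject an indefinite one:

# hypothetical sanity check for is_covariance
valid = np.array([[2.0, 0.5],
                  [0.5, 1.0]])    # symmetric, both eigenvalues positive
invalid = np.array([[1.0, 2.0],
                    [2.0, 1.0]])  # symmetric but indefinite (eigenvalues 3 and -1)
print(is_covariance(valid), is_covariance(invalid))  # expected: True False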

Generate data to simulate a dataset of samples $x$ in which all 3 features/columns could be collected. $x$ has size $n$ by $features$.


In [2]:
#mean and covariance of the multivariate normal dist used to generate samples
mu=np.array([5,0,-2])
σ=np.array([[ 9,  1, -1],
            [ 1,  3, -2],
            [-1, -2,  2]])
if not is_covariance(σ):
    print("Warning: σ is not a valid covariance matrix (not symmetric or positive definite)")

n=1000 # number of samples
x=np.random.multivariate_normal(mu,σ,size=(n,))

#plot generated data
plt.close("all")
plot_data(x,"data in original space",eigen=True)
plt.show()
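
As an optional check (a hedged sketch), the sample covariance of the generated data should be close to σ for large $n$; np.cov with rowvar=False treats each row as one sample:

# the sample covariance should approximate σ up to sampling noise
sample_cov = np.cov(x, rowvar=False)
print(np.round(sample_cov, 2))
print(np.allclose(sample_cov, σ, atol=0.5))  # loose tolerance because x is a random sample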

Transform the generated data $x$ to a new basis given by the eigenvectors of $cov(x)$. In this new basis, the eigenvectors coincide with the canonical axis vectors $(1,0,0)$, $(0,1,0)$ and $(0,0,1)$.


In [3]:
pca_exact = PCA(n_components=3) # since x has 3 features, this PCA model does not do compression
pca_exact.fit(x) # calculate eigen decomposition of cov(x)

x_transformed=pca_exact.transform(x) #encode x with the eigenvectors as basis
plot_data(x_transformed,"x in natural (eigenvectors) space",eigen=True)

#save the eigenvectors and eigenvalues
eigenvectors=pca_exact.components_
eigenvalues=pca_exact.explained_variance_
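
For reference, with the default whiten=False, PCA's transform is just centering followed by projection onto the eigenvectors (the rows of components_); a minimal sketch of that equivalence:

# transform(x) == (x - mean_) projected onto each eigenvector
x_manual = (x - pca_exact.mean_) @ pca_exact.components_.T
print(np.allclose(x_manual, x_transformed))  # expected: True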

Generate another dataset $y$ with the same distribution as $x$ (this is a very strong assumption!)


In [4]:
y=np.random.multivariate_normal(mu,σ,size=(n,))
plot_data(y,"y in original space",eigen=True)

Let's simulate the fact that for $y$ we can't measure all the features. To do so, we create y_missing, which keeps only the first 2 features.


In [5]:
y_missing=y[:,0:2]
plt.figure()
plt.scatter(y_missing[:,0],y_missing[:,1])
plt.title("y_missing in original space (2d)")


Out[5]:
Text(0.5,1,'y_missing in original space (2d)')

Now, let's assume that we can recover the missing feature of y_missing using information from the eigendecomposition of $cov(x)$.

We perform PCA on $x$, but keep only the two most important eigenvectors (those with the largest eigenvalues). pca_reconstruction lets us perform a forward transform from $R^3$ to $R^2$ (compression) and an inverse transform from $R^2$ to $R^3$ (reconstruction). This can also be interpreted as going from the canonical basis to the (truncated) eigenbasis and vice versa.


In [9]:
pca_reconstruction=PCA(n_components=2)
pca_reconstruction.fit(x)
print(pca_reconstruction.components_)
print(eigenvectors)


[[ 0.94125744  0.25425767 -0.22223293]
 [ 0.33698043 -0.74984978  0.56935884]]
[[ 0.94125744  0.25425767 -0.22223293]
 [ 0.33698043 -0.74984978  0.56935884]
 [-0.02187746 -0.61080139 -0.79148154]]
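
To make the compression/reconstruction interpretation concrete, here is a hedged round-trip sketch on $x$ itself; the per-sample squared error of the round trip should be roughly the discarded (smallest) eigenvalue:

# forward: R^3 -> R^2 (drop the least important direction); inverse: R^2 -> R^3
x_compressed = pca_reconstruction.transform(x)
x_roundtrip = pca_reconstruction.inverse_transform(x_compressed)
print(x_compressed.shape, x_roundtrip.shape)   # (1000, 2) (1000, 3)
print(((x_roundtrip - x)**2).sum()/n)          # ≈ eigenvalues[-1]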

We still can't reconstruct y from y_missing with pca_reconstruction because y_missing is expressed in the canonical basis; we first need to encode it in the eigenbasis. To do that, we augment y_missing with a third feature set to the mean of that feature (a reasonable guess), encode the augmented data with pca_exact, and then drop the coordinate along the least important eigenvector.


In [12]:
y3 = np.zeros((n,1)) + mu[2]              # fill the missing third feature with its mean
y_augmented = np.hstack([y_missing, y3])
y_eigen=pca_exact.transform(y_augmented)

least_eigenvalue_index=np.argmin(eigenvalues) # explained_variance_ is sorted, so this is the last component
y_eigen_2d=y_eigen[:,np.arange(3)!=least_eigenvalue_index] # drop the least important coordinate
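
Assuming both PCA fits returned the eigenvectors with the same signs (which the printout of In [9] suggests), dropping the least important coordinate of pca_exact.transform is equivalent to encoding y_augmented directly with pca_reconstruction; a quick check:

# hedged consistency check between the two encodings
print(np.allclose(y_eigen_2d, pca_reconstruction.transform(y_augmented)))  # expected: True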

In [13]:
y_reconstructed=pca_reconstruction.inverse_transform(y_eigen_2d)
plot_data(y_reconstructed, "y_reconstructed",eigen=True)

In [14]:
# mean over samples of the squared reconstruction error (summed over the 3 features)
mean_reconstruction_error=((y_reconstructed-y)**2).sum()/n
print(mean_reconstruction_error)


1.1738605988
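
As a hedged baseline (not part of the original notebook), compare against simply keeping the mean-filled y_augmented as the reconstruction; the PCA reconstruction typically achieves a lower error here because it exploits the correlations between features learned from $x$:

# baseline: mean-fill the missing feature, leave the observed features untouched
baseline_error = ((y_augmented - y)**2).sum()/n
print(baseline_error)  # compare with mean_reconstruction_error above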